1. Import the necessary libraries

In [102]:
import  pandas         as pd
from    matplotlib     import pyplot as plt
import  seaborn        as sns
import  scipy.stats    as stats
from    scipy.stats    import shapiro
import  plotly.express as px
from    scipy.stats    import skew 
from    sklearn.preprocessing import LabelEncoder
from    scipy.stats    import mannwhitneyu
from    scipy.stats    import chisquare
from    scipy.stats    import f_oneway

2. Read the data as a data frame

In [2]:
df=pd.read_csv("D:/ML/insurance.csv")
df.head()
Out[2]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520

3 .Perform basic EDA which should include the following and print out your insights at every step.

a. Shape of the data

In [3]:
rows,cols=df.shape
print("The number of rows is {0} and the number of columns is {1}".format(rows, cols))
The number of rows is 1338 and the number of columns is 7

b.Data type of each attribute

In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB

c. Checking the presence of missing values

In [5]:
print(df.isnull().sum())
print("There are no missing values in the dataframe")
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
There are no missing values in the dataframe

d. 5 point summary of numerical attributes

In [6]:
df.describe()
Out[6]:
age bmi children charges
count 1338.000000 1338.000000 1338.000000 1338.000000
mean 39.207025 30.663397 1.094918 13270.422265
std 14.049960 6.098187 1.205493 12110.011237
min 18.000000 15.960000 0.000000 1121.873900
25% 27.000000 26.296250 0.000000 4740.287150
50% 39.000000 30.400000 1.000000 9382.033000
75% 51.000000 34.693750 2.000000 16639.912515
max 64.000000 53.130000 5.000000 63770.428010

e. Distribution of ‘bmi’, ‘age’ and ‘charges’ columns

f. Measure of skewness of ‘bmi’, ‘age’ and ‘charges’columns

g. Checking the presence of outliers in ‘bmi’, ‘age’ and ‘charges' columns

In [7]:
if(skew(df['bmi'])>0):
    print("The Distribution is Right skewed with a value of {}".format(skew(df['bmi'])))
elif(skew(df['bmi'])<0):
    print("The Distribution is Left skewed with a value of {}".format(skew(df['bmi'])))
else:
    print("The Distribution is Normal")
fig = px.histogram(df, x="bmi",marginal="box",hover_data=df.columns,)
fig.show()
The Distribution is Right skewed with a value of 0.28372857291709386
In [8]:
if(skew(df['age'])>0):
    print("The Distribution is Right skewed with a value of {}".format(skew(df['age'])))
elif(skew(df['age'])<0):
    print("The Distribution is Left skewed with a value of {}".format(skew(df['age'])))
else:
    print("The Distribution is Normal")
fig = px.histogram(df, x="age",marginal="box",hover_data=df.columns,)
fig.show()
The Distribution is Right skewed with a value of 0.055610083072599126
In [9]:
if(skew(df['charges'])>0):
    print("The Distribution is Right skewed with a value of {}".format(skew(df['charges'])))
elif(skew(df['charges'])<0):
    print("The Distribution is Left skewed with a value of {}".format(skew(df['charges'])))
else:
    print("The Distribution is Normal")
fig = px.histogram(df, x="charges",marginal="box",hover_data=df.columns,)
fig.show()

    
The Distribution is Right skewed with a value of 1.5141797118745743

Distribution of categorical columns (include children)

In [10]:
print(sns.countplot( x="children", data=df))
AxesSubplot(0.125,0.125;0.775x0.755)
In [11]:
print(sns.countplot( x="smoker", data=df))
AxesSubplot(0.125,0.125;0.775x0.755)
In [12]:
print(sns.countplot( x="region", data=df))
AxesSubplot(0.125,0.125;0.775x0.755)
In [13]:
print(sns.countplot( x="sex", data=df))
AxesSubplot(0.125,0.125;0.775x0.755)
In [14]:
col=["sex","smoker","region","children"]
df_copy=df.loc[:,col]
px.parallel_categories(df_copy,color_continuous_scale=["red","yellow","green"])

Pair plot that includes all the columns of the data frame

In [43]:
labelencoder = LabelEncoder()
df['sex_transform'] = labelencoder.fit_transform(df['sex'])
df['region_transform'] = labelencoder.fit_transform(df['region'])
df['smoker_transform'] = labelencoder.fit_transform(df['smoker'])
sns.pairplot(df)
Out[43]:
<seaborn.axisgrid.PairGrid at 0x13a79323248>

4. Answer the following questions with statistical evidence

a. Do charges of people who smoke differ significantly from the people who don't?

In [68]:
gender=df.groupby(["smoker"])
smoker=gender.get_group("yes")["charges"]
non_smoker=gender.get_group("no")["charges"]
sns.distplot(smoker)
sns.distplot(non_smoker)
print(shapiro(smoker))
print(shapiro(non_smoker))
(0.9395521879196167, 3.6251879276250065e-09)
(0.8728628158569336, 1.4455900162299346e-28)
In [76]:
print("As statically proven that both the distributions are not gaussian , we use the non-parametric testing statergy")
stat, p = mannwhitneyu(smoker,non_smoker)
print('stat={}, p={}' .format(stat, p))
if p > 0.05:
    print('Both the distributions are same, there is no difference in charges for smokers and non smokers')
else:
    print('Both the distributions are different, there is a significant difference in charges for smokers and non smokers')
As statically proven that both the distributions are not gaussian , we use the non-parametric testing statergy
stat=7403.0, p=2.6351167222517853e-130
Both the distributions are different, there is a significant difference in charges for smokers and non smokers

b. Does bmi of males differ significantly from that of females?

In [74]:
print(shapiro(male_bmi))
print(shapiro(female_bmi))
gender=df.groupby(["sex"])
male_bmi=gender.get_group("male")["bmi"]
female_bmi=gender.get_group("female")["bmi"]
sns.distplot(male_bmi)
sns.distplot(female_bmi)
(0.9930475354194641, 0.003130641533061862)
(0.9930257797241211, 0.0035433683078736067)
Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x13a7e34cec8>
In [77]:
print("As statically proven that both the distributions are not gaussian , we use the non-parametric testing statergy")
stat, p = mannwhitneyu(male_bmi,female_bmi)
print('stat={}, p={}' .format(stat, p))
if p > 0.05:
    print('Both the distributions are same, there is no difference in BMI between Male and Female')
else:
    print('Both the distributions are different, there is a significant difference in in BMI between Male and Female')
As statically proven that both the distributions are not gaussian , we use the non-parametric testing statergy
stat=212180.0, p=0.05070064020142721
Both the distributions are same, there is no difference in BMI between Male and Female

c. Is the proportion of smokers significantly different in different genders?

In [94]:
smoker_gender=pd.crosstab(df['sex'],df['smoker'])
stat, p = chisquare(smoker_gender['yes'])
print('stat={}, p={}' .format(stat, p))
if p > 0.05:
    print('There is no difference in propotion of smokers between Male and Female')
else:
    print('There is a difference in propotion of smokers between Male and Female')
stat=7.065693430656935, p=0.007857389588994759
There is a difference in propotion of smokers between Male and Female

d. Is the distribution of bmi across women with no children, one child and two children, the same ?

In [105]:
bmi=df.groupby("children")
bmi_0=bmi.get_group(0)["bmi"]
bmi_1=bmi.get_group(1)["bmi"]
bmi_2=bmi.get_group(2)["bmi"]
sns.distplot(bmi_0)
sns.distplot(bmi_1)
sns.distplot(bmi_2)
stat, p = f_oneway(bmi_0, bmi_1, bmi_2)
print('stat={}, p={}'.format(stat, p))
if p > 0.05:
    print('There is no significant difference in bmi across women with no children, one child and two children')
else:
    print('There is significant difference in bmi across women with no children, one child and two children')
stat=0.4169829280345703, p=0.6591330886467935
There is no significant difference in bmi across women with no children, one child and two children